The aim of analyzing the provided crime and temperature data is to gain insight into the relationship between weather conditions and crime incidents. We explore patterns and trends in crime incidents, which helps identify areas with higher crime rates and understand the nature of the incidents that occur. We also provide recommendations for crime prevention: based on our analysis, we aim to identify potential strategies or interventions that can help mitigate the impact of weather-related factors on crime.
library(dplyr)
library(tidyr)
library(stringr)
library(tidytext)
library(e1071)
library(mlbench)
library(ggplot2)
library(gridExtra)
We load the temperature data, which records for each date the temperature, wind speed, pressure at sea level in hectopascals (hPa), total cloud cover in oktas, low cloud cover in oktas, sunshine duration in hours, visibility in kilometres, and snow depth in centimetres, among other columns.
# Load the weather observations and preview them.
df <- read.csv("temp2023.csv")
head(df)

# Parse the Date column. The earlier hard-coded "%m/%d/%Y" format left every
# date as NA, so the monthly summary collapsed into a single "NA-NA" group and
# nothing matched the crime data in the later merge. `tryFormats` uses the first
# format that parses the first non-missing value (ISO first, then the original
# US-style format) and raises an error when none of them fit.
df$Date <- as.Date(
  df$Date,
  tryFormats = c("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y")
)
if (anyNA(df$Date)) {
  # Surface partially unparsed input instead of silently creating an NA group.
  warning(
    sum(is.na(df$Date)), " date value(s) in temp2023.csv could not be parsed",
    call. = FALSE
  )
}
df

# Average every numeric column per calendar month and build a "YYYY-MM" key
# (`date_formatted`) that the crime data is later merged on.
df_avg <- df %>%
  group_by(month = format(Date, "%m"), year = format(Date, "%Y")) %>%
  summarise(
    # An anonymous function replaces the deprecated `across(..., na.rm = TRUE)`.
    across(where(is.numeric), function(x) mean(x, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  mutate(date_formatted = paste(year, month, sep = "-"))
Warning: There was 1 warning in `summarise()`.
ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
ℹ In group 1: `month = NA`, `year = NA`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.
# Previously
across(a:b, mean, na.rm = TRUE)
# Now
across(a:b, \(x) mean(x, na.rm = TRUE))
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Show the year-month keys generated for the monthly averages.
df_avg$date_formatted
[1] "NA-NA"
# Preview the first ten rows of the monthly averages.
df_avg%>%head(10)
NA
# Read the crime incident records and preview the first ten rows.
df1 <- read.csv(file = "crime23.csv")
head(df1, n = 10)
NA
# Full outer join of the monthly weather averages with the crime records,
# matching the weather key `date_formatted` against the crime `date` column.
merged_data <- merge(
  x = df_avg,
  y = df1,
  by.x = "date_formatted",
  by.y = "date",
  all = TRUE
)
head(merged_data, n = 10)
names(merged_data)
[1] "date_formatted" "month" "year" "station_ID" "TemperatureCAvg" "TemperatureCMax"
[7] "TemperatureCMin" "TdAvgC" "HrAvg" "WindkmhInt" "WindkmhGust" "PresslevHp"
[13] "Precmm" "TotClOct" "lowClOct" "SunD1h" "VisKm" "SnowDepcm"
[19] "category" "persistent_id" "lat" "long" "street_id" "street_name"
[25] "context" "id" "location_type" "location_subtype" "outcome_status"
# Number of rows (incidents) per month: bars with a connecting line, a smoothed
# trend, a point per month, and the count printed above each bar.
merged_data %>%
  count(date_formatted, sort = TRUE, name = "Count") %>%
  ggplot(aes(factor(date_formatted), Count, group = 1)) +
  geom_col(fill = "red", color = "black") +
  geom_line(color = "black") +
  geom_smooth() +
  geom_point(color = "green") +
  # `vjust` is a constant, so it is set outside aes().
  geom_text(aes(label = Count), vjust = -0.3, color = "black") +
  scale_x_discrete(guide = guide_axis(angle = 90)) +
  # The y axis shows incidents per month, not categories.
  labs(x = "Month", y = "Number of incidents") +
  theme_minimal()
# Per-column summary statistics of the merged weather/crime table.
summary(merged_data)
date_formatted month year station_ID TemperatureCAvg TemperatureCMax TemperatureCMin
Length:6879 Length:6879 Length:6879 Min. :3590 Min. :10.92 Min. :15.13 Min. :6.365
Class :character Class :character Class :character 1st Qu.:3590 1st Qu.:10.92 1st Qu.:15.13 1st Qu.:6.365
Mode :character Mode :character Mode :character Median :3590 Median :10.92 Median :15.13 Median :6.365
Mean :3590 Mean :10.92 Mean :15.13 Mean :6.365
3rd Qu.:3590 3rd Qu.:10.92 3rd Qu.:15.13 3rd Qu.:6.365
Max. :3590 Max. :10.92 Max. :15.13 Max. :6.365
NA's :6878 NA's :6878 NA's :6878 NA's :6878
TdAvgC HrAvg WindkmhInt WindkmhGust PresslevHp Precmm TotClOct lowClOct
Min. :7.578 Min. :81.25 Min. :16.81 Min. :40.87 Min. :1014 Min. :1.866 Min. :4.988 Min. :6.443
1st Qu.:7.578 1st Qu.:81.25 1st Qu.:16.81 1st Qu.:40.87 1st Qu.:1014 1st Qu.:1.866 1st Qu.:4.988 1st Qu.:6.443
Median :7.578 Median :81.25 Median :16.81 Median :40.87 Median :1014 Median :1.866 Median :4.988 Median :6.443
Mean :7.578 Mean :81.25 Mean :16.81 Mean :40.87 Mean :1014 Mean :1.866 Mean :4.988 Mean :6.443
3rd Qu.:7.578 3rd Qu.:81.25 3rd Qu.:16.81 3rd Qu.:40.87 3rd Qu.:1014 3rd Qu.:1.866 3rd Qu.:4.988 3rd Qu.:6.443
Max. :7.578 Max. :81.25 Max. :16.81 Max. :40.87 Max. :1014 Max. :1.866 Max. :4.988 Max. :6.443
NA's :6878 NA's :6878 NA's :6878 NA's :6878 NA's :6878 NA's :6878 NA's :6878 NA's :6878
SunD1h VisKm SnowDepcm category persistent_id lat long
Min. :5.127 Min. :32.11 Min. :1 Length:6879 Length:6879 Min. :51.88 Min. :0.8793
1st Qu.:5.127 1st Qu.:32.11 1st Qu.:1 Class :character Class :character 1st Qu.:51.89 1st Qu.:0.8964
Median :5.127 Median :32.11 Median :1 Mode :character Mode :character Median :51.89 Median :0.9014
Mean :5.127 Mean :32.11 Mean :1 Mean :51.89 Mean :0.9030
3rd Qu.:5.127 3rd Qu.:32.11 3rd Qu.:1 3rd Qu.:51.89 3rd Qu.:0.9088
Max. :5.127 Max. :32.11 Max. :1 Max. :51.90 Max. :0.9246
NA's :6878 NA's :6878 NA's :6878 NA's :1 NA's :1
street_id street_name context id location_type location_subtype outcome_status
Min. :2152702 Length:6879 Mode:logical Min. :107582824 Length:6879 Length:6879 Length:6879
1st Qu.:2153025 Class :character NA's:6879 1st Qu.:109309182 Class :character Class :character Class :character
Median :2153158 Mode :character Median :111497486 Mode :character Mode :character Mode :character
Mean :2153877 Mean :111301793
3rd Qu.:2153365 3rd Qu.:113746477
Max. :2343256 Max. :115699577
NA's :1 NA's :1
# Incidents per crime category, most frequent first.
count(merged_data, category, sort = TRUE, name = "Count")
# Keep only the month key ("date_formatted") and the crime category
subset_data <- merged_data[, c("date_formatted", "category")]
# Remove rows with missing values
subset_data <- na.omit(subset_data)

# Cross-tabulate months (rows) against categories (columns). barplot() draws
# one bar per column and stacks one segment per row, so each bar is a category
# and each coloured segment is a month.
month_by_category <- table(subset_data$date_formatted, subset_data$category)

# Plot a stacked bar graph
barplot(
  month_by_category,
  main = "Category Distribution by date_formatted",
  xlab = "category",
  ylab = "Count",
  las = 2, # Rotate x-axis labels vertically
  # One colour per stacked segment, i.e. per month (table row), so colours are
  # not recycled or left unused.
  col = rainbow(nrow(month_by_category)),
  # Full argument name instead of the partially matched `legend`.
  legend.text = TRUE
)
NA
NA
# Incidents per category together with their rounded share of the total.
d <- merged_data %>%
  count(category) %>%
  mutate(perc = round(100 * n / sum(n)))

# Pie chart: a single stacked bar wrapped into polar coordinates, with the
# percentage label centred in each slice.
ggplot(d, aes(x = "", y = n, fill = category)) +
  geom_col() +
  geom_text(
    aes(label = paste0(perc, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y") +
  labs(
    fill = "Category",
    x = NULL,
    y = NULL,
    title = "Incidents by Category",
    subtitle = "Percentage of Total"
  ) +
  theme_minimal()
# Keep the three temperature columns plus snow depth, dropping incomplete rows.
temperature_cols <- c("TemperatureCAvg", "TemperatureCMin", "TemperatureCMax")
cleaned_data <- na.omit(merged_data[, c(temperature_cols, "SnowDepcm")])

# Fixed colour for each temperature series shown in the legend.
temperature_palette <- c(
  "TemperatureCAvg" = "#999999",
  "TemperatureCMin" = "#E69F00",
  "TemperatureCMax" = "#56B4E9"
)

# Average, minimum and maximum temperature against snow depth, one point layer
# per series; the mean (red) and median (blue) of the average temperature are
# overlaid as diamond markers.
p <- ggplot(cleaned_data, aes(x = SnowDepcm, color = "TemperatureCAvg")) +
  geom_point(aes(y = TemperatureCAvg), size = 3) +
  geom_point(aes(y = TemperatureCMin, color = "TemperatureCMin"), size = 3) +
  geom_point(aes(y = TemperatureCMax, color = "TemperatureCMax"), size = 3) +
  scale_color_manual(values = temperature_palette) +
  labs(x = "Snow Depth (cm)", y = "Temperature (°C)", color = "Variables") +
  theme_minimal() +
  stat_summary(
    aes(y = TemperatureCAvg),
    fun = mean, geom = "point", shape = 18, size = 3, color = "red"
  ) +
  stat_summary(
    aes(y = TemperatureCAvg),
    fun = median, geom = "point", shape = 18, size = 3, color = "blue"
  )

# Render the plot.
p
# List the column names of the merged table.
print(colnames(merged_data))
[1] "date_formatted" "month" "year" "station_ID" "TemperatureCAvg" "TemperatureCMax"
[7] "TemperatureCMin" "TdAvgC" "HrAvg" "WindkmhInt" "WindkmhGust" "PresslevHp"
[13] "Precmm" "TotClOct" "lowClOct" "SunD1h" "VisKm" "SnowDepcm"
[19] "category" "persistent_id" "lat" "long" "street_id" "street_name"
[25] "context" "id" "location_type" "location_subtype" "outcome_status"
library(patchwork)

# Build a unit-binned histogram of one numeric column.
#   data    - data frame holding the column
#   column  - column name as a string
#   fill    - bar fill colour
#   title   - plot title
#   x_label - x-axis label
plot_weather_hist <- function(data, column, fill, title, x_label) {
  # `.data[[column]]` looks the column up by its string name inside aes().
  ggplot(data, aes(x = .data[[column]])) +
    geom_histogram(binwidth = 1, fill = fill, color = "black", alpha = 0.6) +
    labs(title = title, x = x_label, y = "Frequency")
}

# Histogram for WindkmhInt
hist_windkmhInt <- plot_weather_hist(
  merged_data, "WindkmhInt", "skyblue",
  "WindSpeed(km/h)-Int", "Wind Speed (km/h)"
)
# Histogram for WindkmhGust
hist_windkmhGust <- plot_weather_hist(
  merged_data, "WindkmhGust", "lightgreen",
  "Wind Speed(km/h)-Gust", "Wind Speed (km/h)"
)
# Histogram for VisKm
hist_VisKm <- plot_weather_hist(
  merged_data, "VisKm", "salmon",
  "Visibility(km)", "Visibility (km)"
)

# Arrange the three histograms side by side
hist_windkmhInt + hist_windkmhGust + hist_VisKm + plot_layout(ncol = 3)
# Keep only the rows that have a recorded outcome, then summarise that column.
has_outcome <- !is.na(merged_data$outcome_status)
merged_data <- merged_data[has_outcome, ]
summary(merged_data$outcome_status)
Length Class Mode
6201 character character
# Load the ggplot2 package
library(ggplot2)

# Jittered scatter plot of crime category against street ID. The title now
# names the variables that are actually plotted; it previously referred to
# outcome status, which this plot does not use.
sinaplot <- ggplot(merged_data, aes(x = street_id, y = category)) +
  geom_jitter(width = 0.3, height = 0.3) + # Jitter the points to avoid overlap
  labs(
    x = "street_id",
    y = "Category",
    title = "Jitter Plot of Category by Street ID"
  )

# Print the plot
print(sinaplot)
library(ggplot2)

# Jitter plot of crime category against month. The title now matches the
# plotted variables; it previously mentioned street name, which is not used.
ggplot(merged_data, aes(x = date_formatted, y = category)) +
  geom_jitter(alpha = 0.5) +
  labs(
    x = "Date Formatted",
    y = "category",
    title = "Relation between Date Formatted and Category"
  ) +
  # Rotate x-axis labels
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Calculate the correlation matrix of the temperature columns and snow depth.
# `use = "pairwise.complete.obs"` skips missing values pair by pair; with the
# default, a single NA in a column turns every correlation involving it into NA.
correlation_matrix <- cor(
  merged_data[, c("TemperatureCAvg", "TemperatureCMax", "TemperatureCMin", "SnowDepcm")],
  use = "pairwise.complete.obs"
)

# Visualize the correlation matrix using a heatmap
library(ggplot2)

# Convert the correlation matrix to long format (columns Var1, Var2, Freq)
correlation_df <- as.data.frame(as.table(correlation_matrix))

# Plot the heatmap
ggplot(correlation_df, aes(Var1, Var2, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0,
    limits = c(-1, 1), # full argument name instead of the partial `limit`
    space = "Lab",
    name = "Correlation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_fixed()
NA
NA
# List the column names of the merged table.
print(colnames(merged_data))
[1] "date_formatted" "month" "year" "station_ID" "TemperatureCAvg" "TemperatureCMax"
[7] "TemperatureCMin" "TdAvgC" "HrAvg" "WindkmhInt" "WindkmhGust" "PresslevHp"
[13] "Precmm" "TotClOct" "lowClOct" "SunD1h" "VisKm" "SnowDepcm"
[19] "category" "persistent_id" "lat" "long" "street_id" "street_name"
[25] "context" "id" "location_type" "location_subtype" "outcome_status"
# Packages used for the interactive chart and the filtering step
library(plotly)
library(dplyr)

# Drop incidents that lack a street name or a category
filtered_data <- merged_data %>%
  filter(!is.na(street_name), !is.na(category))

# Interactive marker plot: one point per incident, street name against category
scatter_plot <- filtered_data %>%
  plot_ly(
    x = ~street_name,
    y = ~category,
    type = "scatter",
    mode = "markers"
  ) %>%
  layout(
    title = "Scatter Plot of Categories by Street Names",
    xaxis = list(title = "Street Name"),
    yaxis = list(title = "Category"),
    hovermode = "closest"
  )

# Display the chart
scatter_plot
# Load necessary libraries
library(plotly)
library(dplyr)

# Filter out rows with missing street names or categories
filtered_data <- merged_data %>%
  filter(!is.na(street_name) & !is.na(category))

# Count incidents for each street name / category pair. `.groups = "drop"`
# returns an ungrouped result directly, so no separate ungroup() is needed and
# summarise() no longer emits its "grouped output" message.
count_data <- filtered_data %>%
  group_by(street_name, category) %>%
  summarise(count = n(), .groups = "drop")
`summarise()` has grouped output by 'street_name'. You can override using the `.groups` argument.
# Interactive 3-D marker plot: street name and category on the horizontal
# axes, incident count on the vertical axis and in the hover text.
scatter_plot <- count_data %>%
  plot_ly(
    x = ~street_name,
    y = ~category,
    z = ~count,
    type = "scatter3d",
    mode = "markers",
    text = ~paste("Count: ", count)
  ) %>%
  layout(
    title = "Scatter Plot of Categories by Street Names",
    scene = list(
      xaxis = list(title = "Street Name"),
      yaxis = list(title = "Category"),
      zaxis = list(title = "Count")
    ),
    hovermode = "closest"
  )

# Display the chart
scatter_plot
NA
# Incidents per month and category. The plots below read `category_counts`,
# which was not defined anywhere before this point, so it is derived here from
# `merged_data` (rows without a category are left out).
category_counts <- merged_data %>%
  filter(!is.na(category)) %>%
  count(date_formatted, category, name = "count")

# Time series plot: one line per crime category
crime_trend_plot <- ggplot(
  category_counts,
  aes(x = date_formatted, y = count, color = category, group = category)
) +
  geom_line() +
  labs(
    title = "Crime Trends in Colchester (2023)",
    x = "Date",
    y = "Number of Crimes",
    color = "Category"
  ) +
  theme_minimal()
crime_trend_plot

# Time series plot with smoothing: same chart plus a smoothing line
crime_trend_plot +
  geom_smooth(method = "auto", se = FALSE)
# Replace the column and row names with generic sequential labels. The label
# vectors are sized from the data frame itself; the previous hard-coded 1:5
# did not match its dimensions, which left NA column names and raised
# "invalid 'row.names' length".
# NOTE(review): this overwrites the descriptive column names - confirm that is
# intended before running it ahead of any further analysis.
colnames(merged_data) <- paste0("col", seq_len(ncol(merged_data)))
rownames(merged_data) <- paste0("row", seq_len(nrow(merged_data)))
Error in `.rowNamesDF<-`(x, value = value) : invalid 'row.names' length